Libraries needed:

library(tidyverse)
library(gghighlight)
library(patchwork)

Import Data

The data used in this work are house sale prices in Ames, Iowa, USA from 2006-2010, downloaded from Kaggle: https://www.kaggle.com/c/house-prices-advanced-regression-techniques .

# Load the Kaggle training set and convert it to a tibble for compact printing.
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
df_ori <- read.csv("train.csv")
df_ori <- as_tibble(df_ori)
df_ori

All columns in dataset :

# List every column of df_ori with its type and first few values.
glimpse(df_ori)
## Rows: 1,460
## Columns: 81
## $ Id            <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 1~
## $ MSSubClass    <int> 60, 20, 60, 70, 60, 50, 20, 60, 50, 190, 20, 60, 20, 20,~
## $ MSZoning      <chr> "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RL", "RM", "R~
## $ LotFrontage   <int> 65, 80, 68, 60, 84, 85, 75, NA, 51, 50, 70, 85, NA, 91, ~
## $ LotArea       <int> 8450, 9600, 11250, 9550, 14260, 14115, 10084, 10382, 612~
## $ Street        <chr> "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", "Pave", ~
## $ Alley         <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ LotShape      <chr> "Reg", "Reg", "IR1", "IR1", "IR1", "IR1", "Reg", "IR1", ~
## $ LandContour   <chr> "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", "Lvl", ~
## $ Utilities     <chr> "AllPub", "AllPub", "AllPub", "AllPub", "AllPub", "AllPu~
## $ LotConfig     <chr> "Inside", "FR2", "Inside", "Corner", "FR2", "Inside", "I~
## $ LandSlope     <chr> "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", "Gtl", ~
## $ Neighborhood  <chr> "CollgCr", "Veenker", "CollgCr", "Crawfor", "NoRidge", "~
## $ Condition1    <chr> "Norm", "Feedr", "Norm", "Norm", "Norm", "Norm", "Norm",~
## $ Condition2    <chr> "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", "Norm", ~
## $ BldgType      <chr> "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", "1Fam", ~
## $ HouseStyle    <chr> "2Story", "1Story", "2Story", "2Story", "2Story", "1.5Fi~
## $ OverallQual   <int> 7, 6, 7, 7, 8, 5, 8, 7, 7, 5, 5, 9, 5, 7, 6, 7, 6, 4, 5,~
## $ OverallCond   <int> 5, 8, 5, 5, 5, 5, 5, 6, 5, 6, 5, 5, 6, 5, 5, 8, 7, 5, 5,~
## $ YearBuilt     <int> 2003, 1976, 2001, 1915, 2000, 1993, 2004, 1973, 1931, 19~
## $ YearRemodAdd  <int> 2003, 1976, 2002, 1970, 2000, 1995, 2005, 1973, 1950, 19~
## $ RoofStyle     <chr> "Gable", "Gable", "Gable", "Gable", "Gable", "Gable", "G~
## $ RoofMatl      <chr> "CompShg", "CompShg", "CompShg", "CompShg", "CompShg", "~
## $ Exterior1st   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Sdng", "VinylSd", "~
## $ Exterior2nd   <chr> "VinylSd", "MetalSd", "VinylSd", "Wd Shng", "VinylSd", "~
## $ MasVnrType    <chr> "BrkFace", "None", "BrkFace", "None", "BrkFace", "None",~
## $ MasVnrArea    <int> 196, 0, 162, 0, 350, 0, 186, 240, 0, 0, 0, 286, 0, 306, ~
## $ ExterQual     <chr> "Gd", "TA", "Gd", "TA", "Gd", "TA", "Gd", "TA", "TA", "T~
## $ ExterCond     <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T~
## $ Foundation    <chr> "PConc", "CBlock", "PConc", "BrkTil", "PConc", "Wood", "~
## $ BsmtQual      <chr> "Gd", "Gd", "Gd", "TA", "Gd", "Gd", "Ex", "Gd", "TA", "T~
## $ BsmtCond      <chr> "TA", "TA", "TA", "Gd", "TA", "TA", "TA", "TA", "TA", "T~
## $ BsmtExposure  <chr> "No", "Gd", "Mn", "No", "Av", "No", "Av", "Mn", "No", "N~
## $ BsmtFinType1  <chr> "GLQ", "ALQ", "GLQ", "ALQ", "GLQ", "GLQ", "GLQ", "ALQ", ~
## $ BsmtFinSF1    <int> 706, 978, 486, 216, 655, 732, 1369, 859, 0, 851, 906, 99~
## $ BsmtFinType2  <chr> "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "Unf", "BLQ", ~
## $ BsmtFinSF2    <int> 0, 0, 0, 0, 0, 0, 0, 32, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0~
## $ BsmtUnfSF     <int> 150, 284, 434, 540, 490, 64, 317, 216, 952, 140, 134, 17~
## $ TotalBsmtSF   <int> 856, 1262, 920, 756, 1145, 796, 1686, 1107, 952, 991, 10~
## $ Heating       <chr> "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", "GasA", ~
## $ HeatingQC     <chr> "Ex", "Ex", "Ex", "Gd", "Ex", "Ex", "Ex", "Ex", "Gd", "E~
## $ CentralAir    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "~
## $ Electrical    <chr> "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "SBrkr", "S~
## $ X1stFlrSF     <int> 856, 1262, 920, 961, 1145, 796, 1694, 1107, 1022, 1077, ~
## $ X2ndFlrSF     <int> 854, 0, 866, 756, 1053, 566, 0, 983, 752, 0, 0, 1142, 0,~
## $ LowQualFinSF  <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ GrLivArea     <int> 1710, 1262, 1786, 1717, 2198, 1362, 1694, 2090, 1774, 10~
## $ BsmtFullBath  <int> 1, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 1, 0, 1, 0, 1, 0, 1,~
## $ BsmtHalfBath  <int> 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ FullBath      <int> 2, 2, 2, 1, 2, 1, 2, 2, 2, 1, 1, 3, 1, 2, 1, 1, 1, 2, 1,~
## $ HalfBath      <int> 1, 0, 1, 0, 1, 1, 0, 1, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 1,~
## $ BedroomAbvGr  <int> 3, 3, 3, 3, 4, 1, 3, 3, 2, 2, 3, 4, 2, 3, 2, 2, 2, 2, 3,~
## $ KitchenAbvGr  <int> 1, 1, 1, 1, 1, 1, 1, 1, 2, 2, 1, 1, 1, 1, 1, 1, 1, 2, 1,~
## $ KitchenQual   <chr> "Gd", "TA", "Gd", "Gd", "Gd", "TA", "Gd", "TA", "TA", "T~
## $ TotRmsAbvGrd  <int> 8, 6, 6, 7, 9, 5, 7, 7, 8, 5, 5, 11, 4, 7, 5, 5, 5, 6, 6~
## $ Functional    <chr> "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", "Typ", ~
## $ Fireplaces    <int> 0, 1, 1, 1, 1, 0, 1, 2, 2, 2, 0, 2, 0, 1, 1, 0, 1, 0, 0,~
## $ FireplaceQu   <chr> NA, "TA", "TA", "Gd", "TA", NA, "Gd", "TA", "TA", "TA", ~
## $ GarageType    <chr> "Attchd", "Attchd", "Attchd", "Detchd", "Attchd", "Attch~
## $ GarageYrBlt   <int> 2003, 1976, 2001, 1998, 2000, 1993, 2004, 1973, 1931, 19~
## $ GarageFinish  <chr> "RFn", "RFn", "RFn", "Unf", "RFn", "Unf", "RFn", "RFn", ~
## $ GarageCars    <int> 2, 2, 2, 3, 3, 2, 2, 2, 2, 1, 1, 3, 1, 3, 1, 2, 2, 2, 2,~
## $ GarageArea    <int> 548, 460, 608, 642, 836, 480, 636, 484, 468, 205, 384, 7~
## $ GarageQual    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "Fa", "G~
## $ GarageCond    <chr> "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "TA", "T~
## $ PavedDrive    <chr> "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "Y", "~
## $ WoodDeckSF    <int> 0, 298, 0, 0, 192, 40, 255, 235, 90, 0, 0, 147, 140, 160~
## $ OpenPorchSF   <int> 61, 0, 42, 35, 84, 30, 57, 204, 0, 4, 0, 21, 0, 33, 213,~
## $ EnclosedPorch <int> 0, 0, 0, 272, 0, 0, 0, 228, 205, 0, 0, 0, 0, 0, 176, 0, ~
## $ X3SsnPorch    <int> 0, 0, 0, 0, 0, 320, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ~
## $ ScreenPorch   <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 176, 0, 0, 0, 0, 0, ~
## $ PoolArea      <int> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,~
## $ PoolQC        <chr> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, ~
## $ Fence         <chr> NA, NA, NA, NA, NA, "MnPrv", NA, NA, NA, NA, NA, NA, NA,~
## $ MiscFeature   <chr> NA, NA, NA, NA, NA, "Shed", NA, "Shed", NA, NA, NA, NA, ~
## $ MiscVal       <int> 0, 0, 0, 0, 0, 700, 0, 350, 0, 0, 0, 0, 0, 0, 0, 0, 700,~
## $ MoSold        <int> 2, 5, 9, 2, 12, 10, 8, 11, 4, 1, 2, 7, 9, 8, 5, 7, 3, 10~
## $ YrSold        <int> 2008, 2007, 2008, 2006, 2008, 2009, 2007, 2009, 2008, 20~
## $ SaleType      <chr> "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "WD", "W~
## $ SaleCondition <chr> "Normal", "Normal", "Normal", "Abnorml", "Normal", "Norm~
## $ SalePrice     <int> 208500, 181500, 223500, 140000, 250000, 143000, 307000, ~

1. Count of one categorical feature

To demonstrate this, we will analyze the count of each type of foundation used for the houses. The corresponding variable is Foundation, whose values are brick and tile (BrkTil), cinder block (CBlock), poured concrete (PConc), slab, stone, and wood:

# Bar chart of the number of houses per Foundation value, labelled with the
# counts; gghighlight() is applied last so it acts on both layers above it.
df_ori %>%
  count(Foundation, name = "nbrow") %>%
  ggplot(aes(x = Foundation, y = nbrow)) +
  geom_col(fill = "#00AFBB") +
  geom_label(aes(label = nbrow)) +
  ylim(0, 850) +
  labs(
    title = "Count of Foundation Variable",
    subtitle = "Cinder block and poured concrete are the most common foundation",
    x = "Foundation",
    y = "Count"
  ) +
  theme_bw() +
  gghighlight(
    max(nbrow) > 600,
    unhighlighted_params = aes(fill = NULL, color = NULL)
  )

By visualizing the count of the Foundation variable, cinder block and poured concrete are easily recognized as the most common foundations used for houses in Ames, Iowa (2006-2010). Other types of foundation such as slab, stone, and wood are rare, while brick and tile foundations are still used for over a hundred houses.

2. Distribution of one continuous feature

The SalePrice variable is analyzed in this section and plotted using a histogram. Since it’s very likely that the histogram will be positively skewed, a log transformation will be applied to make the distribution more ‘normal’. Then, the histograms (with density curves) before and after this transformation will be shown side by side to check whether the transformation works.

# Add LogSalePrice, the natural log of SalePrice.
df_ori <- df_ori %>%
  mutate(LogSalePrice = log(SalePrice))

# Density-scaled histogram of one numeric column with a kernel density overlay
# and a dashed vertical line at the column mean.
#   data:    data frame holding the column
#   column:  name of the column to plot, as a string
#   x_label: x-axis title (defaults to the column name)
# Returns a ggplot object.
# Uses after_stat() and linewidth instead of the deprecated `..density..`
# notation and line `size` (requires ggplot2 >= 3.4.0).
plot_density_hist <- function(data, column, x_label = column) {
  ggplot(data, aes(x = .data[[column]])) +
    geom_histogram(
      aes(y = after_stat(density)),
      bins = 30, fill = "white", color = "black", linewidth = 0.5
    ) +
    geom_density(
      aes(y = after_stat(density)),
      fill = "#E69F00", alpha = 0.2, linewidth = 0.7
    ) +
    # A constant intercept draws a single line; mapping mean() inside aes()
    # would draw the same line once per row of the data.
    geom_vline(
      xintercept = mean(data[[column]]),
      color = "blue", linetype = "dashed", linewidth = 1
    ) +
    theme_minimal() +
    labs(x = x_label)
}

# Histogram of the sale price after the log transformation
p1 <- plot_density_hist(df_ori, "LogSalePrice", x_label = "log(SalePrice)")
# Histogram of the sale price before the transformation
p2 <- plot_density_hist(df_ori, "SalePrice")

# Combine both graphs side by side (patchwork)
(p2 + p1) +
  plot_annotation(
    title = "The Histogram of SalePrice Variable",
    subtitle = "Log transformation successfully transforms the positive-skewed SalePrice distribution into normal distribution"
  )

3. Categorical - Continuous

One thing that immediately comes to mind when I see this dataset is the relationship between Year Sold (YrSold) and Sale Price (LogSalePrice). Please note that even though year is numeric, we can’t say that year is a continuous variable, since (2006+2007)/2 or (2006*2007) do not have any meaning. That’s why I think it’s more reasonable to classify year as a categorical variable.

# Jittered log sale prices per sale year, with a smoothed trend line on top.
ggplot(df_ori, aes(x = YrSold, y = LogSalePrice)) +
  geom_jitter(color = "gray") +
  geom_smooth(color = "black", formula = y ~ s(x, bs = "cs", k = 5)) +
  theme_minimal() +
  labs(
    title = "Sale Price vs Year Sold",
    subtitle = "Houses price tended to be stagnant during 2006-2010 period",
    y = "log(Sale Price)",
    x = "Year Sold"
  )

It’s very clear that prices were stagnant during 2006-2010. This happened because during these years the US was affected by the Great Recession. These years were surely a nightmare for real estate investors, because prices didn’t rise year after year.

Another interesting thing to analyze is the relationship between neighborhood and sale price, which will tell us where the elite areas are, etc. According to Google, a neighborhood is a local geographic area with similar characteristics. It may be referred to by name (e.g., Brooklyn Heights, Palisades) and have designated boundaries. Common ways to plot this kind of data are boxplots or facets, but in this exercise I want to try the ridgeline plot.

library(ggridges) # provides geom_density_ridges() and theme_ridges()
# One sale-price density ridge (in thousands of dollars) per neighborhood.
ggplot(
  df_ori,
  aes(x = SalePrice / 1000, y = Neighborhood, fill = Neighborhood)
) +
  geom_density_ridges() +
  theme_minimal() +
  theme_ridges(font_size = 11, center_axis_labels = TRUE) +
  theme(legend.position = "none") +
  labs(
    x = "Sale Prices in 1000$",
    y = "Neighborhood",
    title = "Sale Price for Each Neighborhood",
    subtitle = "NRidgHt, NoRidge, StoneBr have the most expensive and diverse prices"
  )

From the graph above, we can see that StoneBr, NoRidge and NridgHt are the neighborhoods with the most expensive and most diverse house prices, while MeadowV is the cheapest of all neighborhoods.

Cheapest median :

# Five neighborhoods with the lowest median SalePrice, cheapest first.
df_ori %>%
  group_by(Neighborhood) %>%
  summarise(median = median(SalePrice)) %>%
  arrange(median) %>%
  slice_head(n = 5)

Highest median:

# Five neighborhoods with the highest median SalePrice, most expensive first.
df_ori %>%
  group_by(Neighborhood) %>%
  summarise(median = median(SalePrice)) %>%
  arrange(desc(median)) %>%
  slice_head(n = 5)

4. Continuous - Continuous

One more interesting question is which area variables are strongly correlated with the target variable, SalePrice. First, let’s make a new data frame that contains every variable with ‘Area’ in its name, plus SalePrice.

# Keep every column whose name contains "Area", followed by the SalePrice
# target, then print the data and its correlation matrix.
df_new <- select(df_ori, contains("Area"), SalePrice)
df_new
cor(df_new)
##               LotArea MasVnrArea GrLivArea GarageArea   PoolArea  SalePrice
## LotArea    1.00000000         NA 0.2631162 0.18040276 0.07767239 0.26384335
## MasVnrArea         NA          1        NA         NA         NA         NA
## GrLivArea  0.26311617         NA 1.0000000 0.46899748 0.17020534 0.70862448
## GarageArea 0.18040276         NA 0.4689975 1.00000000 0.06104727 0.62343144
## PoolArea   0.07767239         NA 0.1702053 0.06104727 1.00000000 0.09240355
## SalePrice  0.26384335         NA 0.7086245 0.62343144 0.09240355 1.00000000

Remove MasVnrArea because its Pearson correlations are NA, and add a few more columns holding the log-transformed variables.

# Drop MasVnrArea and add the natural log of SalePrice and of every remaining
# area column, then recompute the correlation matrix.
df_new <- df_new %>%
  select(-MasVnrArea) %>%
  mutate(
    LogSalePrice = log(SalePrice),
    LogLotArea = log(LotArea),
    LogGrLivArea = log(GrLivArea),
    LogPoolArea = log(PoolArea),
    LogGarageArea = log(GarageArea)
  )
cor(df_new)
##                  LotArea GrLivArea GarageArea   PoolArea  SalePrice
## LotArea       1.00000000 0.2631162 0.18040276 0.07767239 0.26384335
## GrLivArea     0.26311617 1.0000000 0.46899748 0.17020534 0.70862448
## GarageArea    0.18040276 0.4689975 1.00000000 0.06104727 0.62343144
## PoolArea      0.07767239 0.1702053 0.06104727 1.00000000 0.09240355
## SalePrice     0.26384335 0.7086245 0.62343144 0.09240355 1.00000000
## LogSalePrice  0.25731989 0.7009267 0.65088756 0.06979781 0.94837373
## LogLotArea    0.69794532 0.3945774 0.32204548 0.09179094 0.38852027
## LogGrLivArea  0.24535747 0.9663720 0.47616021 0.10974441 0.69511807
## LogPoolArea          NaN       NaN        NaN        NaN        NaN
## LogGarageArea        NaN       NaN        NaN        NaN        NaN
##               LogSalePrice LogLotArea LogGrLivArea LogPoolArea LogGarageArea
## LotArea         0.25731989 0.69794532    0.2453575         NaN           NaN
## GrLivArea       0.70092665 0.39457745    0.9663720         NaN           NaN
## GarageArea      0.65088756 0.32204548    0.4761602         NaN           NaN
## PoolArea        0.06979781 0.09179094    0.1097444         NaN           NaN
## SalePrice       0.94837373 0.38852027    0.6951181         NaN           NaN
## LogSalePrice    1.00000000 0.39991774    0.7302549         NaN           NaN
## LogLotArea      0.39991774 1.00000000    0.3854352         NaN           NaN
## LogGrLivArea    0.73025485 0.38543520    1.0000000         NaN           NaN
## LogPoolArea            NaN        NaN          NaN           1           NaN
## LogGarageArea          NaN        NaN          NaN         NaN             1

We get NaN correlations for LogPoolArea and LogGarageArea because not every house has a garage or a pool: an area of 0 gives log(0) = -Inf, and correlations involving infinite values are undefined.

# Drop the two log columns that produced NaN correlations.
df_new <- select(df_new, -c(LogPoolArea, LogGarageArea))
library(reshape2)
# Blank out the entries strictly below the diagonal of a matrix.
#   cormat: a matrix (here, a correlation matrix)
# Returns `cormat` with its below-diagonal entries set to NA; the diagonal and
# the entries above it are left unchanged.
get_lower_tri <- function(cormat) {
  replace(cormat, lower.tri(cormat), NA)
}
# Correlation matrix rounded to two decimals, then with the entries below the
# diagonal set to NA so each pair of variables appears only once.
df_cor <- round(cor(df_new), digits = 2)
melt_df_cor <- get_lower_tri(df_cor)

Heatmap for Pearson Correlation Matrix :

# Reshape the matrix to long format (Var1, Var2, value), dropping the NA
# cells, and draw it as a labelled heatmap.
melt_df_cor <- melt(melt_df_cor, na.rm = TRUE)
melt_df_cor %>%
  ggplot(aes(x = Var2, y = Var1, fill = value)) +
  geom_tile(color = "black") +
  # x and y are inherited from ggplot(); only the label needs mapping here.
  geom_text(aes(label = value)) +
  scale_fill_gradient2(
    low = "blue", mid = "white", high = "red",
    midpoint = 0,
    # Spelled out in full: `limit` only worked via partial argument matching.
    limits = c(-1, 1),
    space = "Lab",
    name = "Pearson\nCorrelation"
  ) +
  theme_minimal() +
  theme(
    axis.text.x = element_text(angle = 45, vjust = 1, size = 10, hjust = 1)
  ) +
  labs(
    title = "Pearson Correlation Heatmap",
    subtitle = "LogSalePrice vs LogGrLivArea have the highest correlation",
    x = "",
    y = ""
  )

Based on the heatmap, we can see that all area variables are positively correlated, which is reasonable. The highest correlation is between LogSalePrice and LogGrLivArea (above-ground living area), i.e. 0.73, slightly higher than the correlation between SalePrice and GrLivArea, i.e. 0.71. This result suggests that GrLivArea is one of the strong predictors of the target variable SalePrice. Finally, let’s draw a scatter plot with a linear regression line for LogGrLivArea and LogSalePrice.

# Scatter of log(GrLivArea) against LogSalePrice, coloured by neighborhood,
# with a single black linear-regression line over all points.
ggplot(df_ori, aes(x = log(GrLivArea), y = LogSalePrice)) +
  geom_point(aes(color = Neighborhood)) +
  geom_smooth(method = "lm", color = "black") +
  theme_minimal() +
  labs(
    title = "LogGrLivArea vs LogSalePrice",
    subtitle = "The linear regression line fits the scatter point nicely",
    x = "LogGrLivArea",
    y = "LogSalePrice"
  )

BONUS

One of the data visualization milestones I set when I started learning data science is being able to create a geospatial map visualization. Hence, in this section, I want to create a map visualization of the houses in Ames related to this data. Unfortunately, I face two big problems:

To tackle these two problems (which I found really difficult to solve), the dataset used here is the Ames Housing data from the AmesHousing library itself, not the train.csv downloaded from Kaggle.

library(AmesHousing) # source of the Ames data with Latitude/Longitude columns
df_1 <- make_ames()
# as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0.
ames_df <- as_tibble(df_1)
ames_df
library(jpeg)
library(grid)
# Background image for the maps: a JPG of Ames, Iowa taken from
# openstreetmap.org.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists.
googlemap <- readJPEG("D:\\Learning_r\\Cleaning_dATA\\Cleaning_Data_Project\\Project_Cleaning\\HW_Day12_Indra\\map_ames.jpg")

Transform the full neighborhood names in the ames_df dataset into the abbreviations used in the Kaggle data.

# Translate full neighborhood names into their short codes.
#   x: neighborhood name(s) as a character vector or factor
# Returns an unnamed character vector the same length as `x`. Names without a
# known code (and NA) map to NA_character_ rather than NULL or an error, so
# the result is always a plain character vector.
Neigh <- function(x) {
  codes <- c(
    North_Ames = "NAmes",
    Gilbert = "Gilbert",
    Stone_Brook = "StoneBr",
    Northwest_Ames = "NWAmes",
    Somerset = "Somerst",
    Briardale = "BrDale",
    Northpark_Villa = "NPkVill",
    Northridge_Heights = "NridgHt",
    Bloomington_Heights = "Blmngtn",
    Northridge = "NoRidge",
    Sawyer_West = "SawyerW",
    Sawyer = "Sawyer",
    Greens = "Greens",
    Brookside = "BrkSide",
    Old_Town = "OldTown",
    Iowa_DOT_and_Rail_Road = "IDOTRR",
    Clear_Creek = "ClearCr",
    South_and_West_of_Iowa_State_University = "SWISU",
    Edwards = "Edwards",
    College_Creek = "CollgCr",
    Crawford = "Crawfor",
    Blueste = "Blueste",
    Mitchell = "Mitchel",
    Timberland = "Timber",
    Meadow_Village = "MeadowV",
    Veenker = "Veenker",
    Green_Hills = "GreenH",
    Landmark = "Landm"
  )
  # as.character() makes factor input index by label, not by integer code.
  unname(codes[as.character(x)])
}
# Wrap Neigh() with Vectorize() so it can be applied to a whole column, add
# the short neighborhood code to ames_df, then average the coordinates of the
# houses in each neighborhood.
Neigh <- Vectorize(Neigh)
ames_df <- mutate(ames_df, Neigh_Initial = Neigh(Neighborhood))
ames_df_locate <- ames_df %>%
  group_by(Neigh_Initial) %>%
  summarise(
    Latitude = mean(Latitude),
    Longitude = mean(Longitude)
  )
ames_df_locate

I create this new dataframe in order to label the 28 neighborhoods (not 2930 labels, but only 28).

# Houses plotted by coordinates over the map image, coloured by neighborhood,
# with one label per neighborhood at its mean coordinates.
ggplot(
  filter(ames_df, !is.na(Latitude)),
  aes(x = Longitude, y = Latitude, color = Neigh_Initial)
) +
  # The image is stretched over the whole panel and drawn first, so it sits
  # behind the points and labels.
  annotation_custom(
    rasterGrob(googlemap, width = unit(1, "npc"), height = unit(1, "npc")),
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf
  ) +
  geom_point() +
  geom_label(
    data = ames_df_locate,
    label = ames_df_locate$Neigh_Initial,
    alpha = 0.8,
    size = 3
  ) +
  scale_x_continuous(limits = c(-93.7100, -93.5600)) +
  scale_y_continuous(limits = c(41.9700, 42.0800)) +
  labs(title = "Map of Neighborhoods in Ames,Iowa")

And this is the map of Neighborhoods in Ames, Iowa. Next, we will plot the sale prices based on this map

# Houses plotted by coordinates over the map image, coloured by sale price
# (in thousands) split into six bins with cut_number().
ggplot(
  filter(ames_df, !is.na(Latitude)),
  aes(x = Longitude, y = Latitude, color = cut_number(Sale_Price / 1000, 6))
) +
  # The image is stretched over the whole panel and drawn first, so it sits
  # behind the points.
  annotation_custom(
    rasterGrob(googlemap, width = unit(1, "npc"), height = unit(1, "npc")),
    xmin = -Inf, xmax = Inf, ymin = -Inf, ymax = Inf
  ) +
  geom_point() +
  scale_x_continuous(limits = c(-93.7100, -93.5600)) +
  scale_y_continuous(limits = c(41.9700, 42.0800)) +
  scale_color_discrete(name = "Price in $1000") +
  labs(
    title = "House Prices by Location",
    subtitle = "Highest prices found in NridgHt, NoRidge, and StoneBr"
  )

The conclusion we get from this map visualization is similar to before, i.e. the neighborhoods with the highest prices are NridgHt, NoRidge, StoneBr, Timber, and Somerst, while those with the lowest prices are MeadowV, IDOTRR, BrDale, OldTown, and Edwards.

P.S.: Maybe in the next work I will adjust the neighborhood labels a little so they are easier to read.

Credit to these websites, which helped me by giving insights about this data and ggplot2, especially the procedure for creating geospatial visualizations: